{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:28:14.869549Z",
     "iopub.status.busy": "2021-07-10T16:28:14.868977Z",
     "iopub.status.idle": "2021-07-10T16:28:24.380797Z",
     "shell.execute_reply": "2021-07-10T16:28:24.379799Z",
     "shell.execute_reply.started": "2021-07-10T16:28:14.869500Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
    "\n",
    "# AutoTokenizer：输入文本的进行分词，并进行转换编码的操作\n",
    "# AutoModelForSequenceClassification：加载置顶的模型，并将输出[CLS]加上全连接进行文本分类\n",
    "\n",
    "model_name = \"bert-base-chinese\" # 可以自己修改\n",
    "\n",
    "pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:28:43.942177Z",
     "iopub.status.busy": "2021-07-10T16:28:43.941642Z",
     "iopub.status.idle": "2021-07-10T16:28:43.947178Z",
     "shell.execute_reply": "2021-07-10T16:28:43.946568Z",
     "shell.execute_reply.started": "2021-07-10T16:28:43.942131Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 791, 1921, 1921, 3698, 2523, 1962, 8024, 852, 3209, 1921, 833, 678, 7433, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inputs = tokenizer(\"me\")\n",
    "inputs\n",
    "# input_ids：\n",
    "# token_type_ids：\n",
    "# attention_mask："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:29:19.811054Z",
     "iopub.status.busy": "2021-07-10T16:29:19.810515Z",
     "iopub.status.idle": "2021-07-10T16:29:19.817458Z",
     "shell.execute_reply": "2021-07-10T16:29:19.816364Z",
     "shell.execute_reply.started": "2021-07-10T16:29:19.811009Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "pt_batch = tokenizer(\n",
    "    [\"hello\", \"now\"],\n",
    "    padding=True,\n",
    "    truncation=True,\n",
    "    max_length=512,\n",
    "    return_tensors=\"pt\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:29:20.209047Z",
     "iopub.status.busy": "2021-07-10T16:29:20.208536Z",
     "iopub.status.idle": "2021-07-10T16:29:20.215541Z",
     "shell.execute_reply": "2021-07-10T16:29:20.214458Z",
     "shell.execute_reply.started": "2021-07-10T16:29:20.209001Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input_ids: [[101, 791, 1921, 678, 7433, 102, 0, 0, 0, 0], [101, 3209, 1921, 678, 7433, 1400, 1921, 678, 7433, 102]]\n",
      "token_type_ids: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
      "attention_mask: [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]\n"
     ]
    }
   ],
   "source": [
    "for key, value in pt_batch.items():\n",
    "    print(f\"{key}: {value.numpy().tolist()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:42:33.618263Z",
     "iopub.status.busy": "2021-07-10T16:42:33.617692Z",
     "iopub.status.idle": "2021-07-10T16:42:38.171868Z",
     "shell.execute_reply": "2021-07-10T16:42:38.170897Z",
     "shell.execute_reply.started": "2021-07-10T16:42:33.618215Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\n",
    "\n",
    "sequence = \"A Titan RTX has 24GB of VRAM\"\n",
    "tokenized_sequence = tokenizer.tokenize(sequence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:42:38.172981Z",
     "iopub.status.busy": "2021-07-10T16:42:38.172786Z",
     "iopub.status.idle": "2021-07-10T16:42:38.176551Z",
     "shell.execute_reply": "2021-07-10T16:42:38.176026Z",
     "shell.execute_reply.started": "2021-07-10T16:42:38.172964Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['A',\n",
       " 'Titan',\n",
       " 'R',\n",
       " '##T',\n",
       " '##X',\n",
       " 'has',\n",
       " '24',\n",
       " '##GB',\n",
       " 'of',\n",
       " 'V',\n",
       " '##RA',\n",
       " '##M']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:42:43.226055Z",
     "iopub.status.busy": "2021-07-10T16:42:43.225524Z",
     "iopub.status.idle": "2021-07-10T16:42:43.231297Z",
     "shell.execute_reply": "2021-07-10T16:42:43.230292Z",
     "shell.execute_reply.started": "2021-07-10T16:42:43.226009Z"
    }
   },
   "outputs": [],
   "source": [
    "inputs = tokenizer(sequence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:42:50.235952Z",
     "iopub.status.busy": "2021-07-10T16:42:50.235433Z",
     "iopub.status.idle": "2021-07-10T16:42:50.241011Z",
     "shell.execute_reply": "2021-07-10T16:42:50.240507Z",
     "shell.execute_reply.started": "2021-07-10T16:42:50.235905Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:43:10.712399Z",
     "iopub.status.busy": "2021-07-10T16:43:10.711834Z",
     "iopub.status.idle": "2021-07-10T16:43:16.185772Z",
     "shell.execute_reply": "2021-07-10T16:43:16.185210Z",
     "shell.execute_reply.started": "2021-07-10T16:43:10.712352Z"
    }
   },
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\n",
    "\n",
    "sequence_a = \"This is a short sequence.\"\n",
    "sequence_b = \"This is a rather long sequence. It is at least longer than the sequence A.\"\n",
    "\n",
    "encoded_sequence_a = tokenizer(sequence_a)[\"input_ids\"]\n",
    "encoded_sequence_b = tokenizer(sequence_b)[\"input_ids\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-10T16:43:16.186812Z",
     "iopub.status.busy": "2021-07-10T16:43:16.186621Z",
     "iopub.status.idle": "2021-07-10T16:43:16.189833Z",
     "shell.execute_reply": "2021-07-10T16:43:16.189424Z",
     "shell.execute_reply.started": "2021-07-10T16:43:16.186795Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[101, 1188, 1110, 170, 1603, 4954, 119, 102]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoded_sequence_a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-11T18:31:50.882727Z",
     "iopub.status.busy": "2021-07-11T18:31:50.882281Z",
     "iopub.status.idle": "2021-07-11T18:31:55.088013Z",
     "shell.execute_reply": "2021-07-11T18:31:55.086956Z",
     "shell.execute_reply.started": "2021-07-11T18:31:50.882693Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\n",
    "sequence_a = \"HuggingFace is based in NYC. So my\"\n",
    "sequence_b = \"Where is HuggingFace based?\"\n",
    "\n",
    "encoded_dict = tokenizer(sequence_a)\n",
    "decoded = tokenizer.decode(encoded_dict[\"input_ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-11T18:31:58.636084Z",
     "iopub.status.busy": "2021-07-11T18:31:58.635532Z",
     "iopub.status.idle": "2021-07-11T18:31:58.641397Z",
     "shell.execute_reply": "2021-07-11T18:31:58.640814Z",
     "shell.execute_reply.started": "2021-07-11T18:31:58.636038Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'[CLS] HuggingFace is based in NYC. So my [SEP]'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "decoded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-07-11T18:31:59.258210Z",
     "iopub.status.busy": "2021-07-11T18:31:59.257692Z",
     "iopub.status.idle": "2021-07-11T18:31:59.411923Z",
     "shell.execute_reply": "2021-07-11T18:31:59.410914Z",
     "shell.execute_reply.started": "2021-07-11T18:31:59.258165Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 119, 1573, 1139, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoded_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
